{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "label_train = pd.read_csv('data/nCoV_100k_train.labled.csv', header=0)\n",
    "unlabel_train = pd.read_csv('data/nCoV_900k_train.unlabled.csv', header=0)\n",
    "test = pd.read_csv('data/nCov_10k_test.csv', header=0)\n",
    "col = ['微博id', '微博中文内容']\n",
    "train = pd.concat([label_train[col], unlabel_train[col]])\n",
    "train = pd.concat([train, test[col]])\n",
    "train.drop_duplicates(subset=['微博中文内容'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a = label_train['微博中文内容']\n",
    "b = test['微博中文内容']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list(set(a) & set(b))[:10]\n",
    "train = pd.read_csv('data/nCoV_100k_train.labled.csv', header=0)\n",
    "train = train[(train['情感倾向'].isin(['-1', '0', '1']))]\n",
    "train['情感倾向'] = train['情感倾向'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# test[test['微博中文内容'] == '我围观了@鲍秀兰诊室的回答，该问题价值120.00元，围观仅1元，快来一起围观~O微博问答?']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = train.drop_duplicates(subset=['微博中文内容'])\n",
    "train = train[train.notnull()][['微博中文内容','情感倾向']]\n",
    "test = test.merge(train,how='inner',on='微博中文内容')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1781, 7), (90109, 2))"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.shape,train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test.to_csv('data/test_repair.csv', index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def seg_sentence_char(sentence):\n",
    "    sentence_seged = sentence.strip()\n",
    "    outstr = ''\n",
    "    for word in sentence_seged:\n",
    "        if word != '\\t':\n",
    "            outstr += word\n",
    "            outstr += \" \"\n",
    "    return outstr\n",
    "\n",
    "# 分词\n",
    "def stopwordslist(filepath):\n",
    "    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]\n",
    "    return stopwords\n",
    "\n",
    "def seg_sentence(sentence):\n",
    "    import jieba_fast as jieba\n",
    "    sentence_seged = jieba.cut(sentence.strip())\n",
    "    outstr = ''\n",
    "    for word in sentence_seged:\n",
    "        if word != '\\t':\n",
    "            outstr += word\n",
    "            outstr += \" \"\n",
    "    return outstr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 2.402 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "# train['content'] = train['微博中文内容'].astype('str').apply(seg_sentence_char)\n",
    "train['content_word'] = train['微博中文内容'].astype('str').apply(seg_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/appcom/AnacondaInstall/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "train['content'].to_csv('data/content.txt', index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/appcom/AnacondaInstall/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "train['content_word'].to_csv('data/content_word.txt', index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/appcom/AnacondaInstall/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    " train['微博中文内容'].astype('str').to_csv('data/content_all_test.txt', index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}